The dataset on police incidents in the City of Seattle includes a collection of events related to 911 emergency calls recorded from 2010 to the present day. The dataset contains approximately 6 million records and has a total size of 5.6GB. It is an open dataset obtained from the official website of the City of Seattle and is available at the following link.
This dataset contains information about calls made to the police and the corresponding police responses in Seattle. The columns in the dataset are as follows:
The dataset provides detailed information on all calls received by the police, including call types, priorities, call and response times, as well as the geographic locations of the incidents. It also includes the initial and final classification of each call, enabling analysis of how incidents are categorized and resolved.
Given the size of the dataset (5.6 GB), the data is loaded and further transformed using Apache Spark tools.
To ensure the correct data types are used, a schema is specified according to which Spark loads the data frames.
schema <- list(
CAD_Event_Number = "character",
Event_Clearance_Description = "character",
Call_Type = "character",
Priority = "character",
Initial_Call_Type = "character",
Final_Call_Type = "character",
Original_Time_Queued = "character",
Arrived_Time = "character",
Precinct = "character",
Sector = "character",
Beat = "character",
Blurred_Longitude = "numeric",
Blurred_Latitude = "numeric"
)
df <- spark_read_csv(sc, name = "police_calls", path = "Call_Data.csv", header = TRUE, columns = schema)
df_clean <- df %>% na.omit()
## * Dropped 3180987 rows with 'na.omit' (10491323 => 7310336)
df_clean <- df_clean %>%
mutate(
Original_Time_Queued = to_timestamp(Original_Time_Queued, "MM/dd/yyyy hh:mm:ss a"),
Arrived_Time = to_timestamp(Arrived_Time, "yyyy MMM dd hh:mm:ss a")
)
df_clean <- df_clean %>%
filter(!is.na(Original_Time_Queued) & !is.na(Arrived_Time) & Original_Time_Queued < Arrived_Time)
rmarkdown::render(“Documentation.Rmd”)
df_clean %>% summarise(
min_original_time = min(Original_Time_Queued, na.rm = TRUE),
max_original_time = max(Original_Time_Queued, na.rm = TRUE),
min_arrived_time = min(Arrived_Time, na.rm = TRUE),
max_arrived_time = max(Arrived_Time, na.rm = TRUE)
) %>% collect()
## # A tibble: 1 × 4
## min_original_time max_original_time min_arrived_time
## <dttm> <dttm> <dttm>
## 1 2009-06-02 03:43:08 2025-10-07 23:58:31 2009-06-02 04:01:55
## # ℹ 1 more variable: max_arrived_time <dttm>
precinct_counts <- df_clean %>%
group_by(Precinct) %>%
summarise(Count = n()) %>%
collect()
ggplot(precinct_counts, aes(x = Precinct, y = Count, fill = Precinct)) +
geom_bar(stat = "identity") +
labs(title = "Distribution by Police Precincts",
x = "Police Precinct",
y = "Number of Occurrences",
fill = "Precinct")
rm(precinct_counts)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3838983 205.1 7690634 410.8 7690634 410.8
## Vcells 44218908 337.4 125562000 958.0 210419010 1605.4
clearence_counts <- df_clean %>%
group_by(Event_Clearance_Description) %>%
summarise(Count = n()) %>%
collect()
clearence_counts <- clearence_counts %>% filter(Count >= 5000)
ggplot(clearence_counts, aes(x = Event_Clearance_Description, y = Count, fill = Event_Clearance_Description)) +
geom_bar(stat = "identity") +
labs(title = "Distribution by Case Resolution Type",
x = "Resolution",
y = "Number of Occurrences",
fill = "Resolution Type") +
theme(axis.text.x = element_blank())
rm(clearence_counts)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3846740 205.5 7690634 410.8 7690634 410.8
## Vcells 44237523 337.6 125562000 958.0 210419010 1605.4
priority_counts <- df_clean %>%
group_by(Priority) %>%
summarise(Count = n())
ggplot(priority_counts, aes(x = Priority, y = Count, fill = Priority)) +
geom_bar(stat = "identity") +
labs(title = "Distribution by Priority",
x = "Priority",
y = "Number of Occurrences",
fill = "Priority")
rm(priority_counts)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3841343 205.2 7690634 410.8 7690634 410.8
## Vcells 44224779 337.5 125562000 958.0 210419010 1605.4
call_type_counts <- df_clean %>%
group_by(Call_Type) %>%
summarise(Count = n())
call_type_counts <- call_type_counts %>% filter(Count > 5000)
ggplot(call_type_counts, aes(x = Call_Type, y = Count, fill = Call_Type)) +
geom_bar(stat = "identity") +
labs(title = "Distribution by Call Method",
x = "Call Method",
y = "Number of Occurrences",
fill = "Call Method") +
theme(axis.text.x = element_blank())
rm(call_type_counts)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3837436 205.0 7690634 410.8 7690634 410.8
## Vcells 44215069 337.4 125562000 958.0 210419010 1605.4
location_sample <- df_clean %>%
select(Blurred_Longitude, Blurred_Latitude) %>%
sdf_sample(fraction = 0.5, replacement = FALSE) %>%
collect()
ggplot(data = location_sample, aes(x = Blurred_Longitude)) +
geom_histogram(bins = 30, fill = "skyblue", color = "black") +
labs(title = "Distribution of Geographic Latitude",
x = "Geographic Latitude",
y = "Frequency")
rm(location_sample)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3833309 204.8 7690634 410.8 7690634 410.8
## Vcells 49979516 381.4 125562000 958.0 210419010 1605.4
location_sample <- df_clean %>%
select(Blurred_Longitude, Blurred_Latitude) %>%
sdf_sample(fraction = 0.5, replacement = FALSE) %>%
collect()
ggplot(data = location_sample, aes(x = Blurred_Latitude)) +
geom_histogram(bins = 30, fill = "skyblue", color = "black") +
labs(title = "Distribution of Geographic Longitude",
x = "Geographic Longitude",
y = "Frequency")
rm(location_sample)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3833313 204.8 7690634 410.8 7690634 410.8
## Vcells 49978232 381.4 125562000 958.0 210419010 1605.4
df_clean_ll <- df_clean %>% filter(Blurred_Latitude >= 47 & Blurred_Latitude <= 48 & Blurred_Longitude >= -123 & Blurred_Longitude <= -122)
# Sample 10% of cleaned location data for histogram
location_clean_sample <- df_clean_ll %>%
select(Blurred_Longitude, Blurred_Latitude) %>%
sdf_sample(fraction = 0.5, replacement = FALSE) %>%
collect()
ggplot(data = location_clean_sample, aes(x = Blurred_Longitude)) +
geom_histogram(bins = 30, fill = "skyblue", color = "black") +
labs(title = "Distribution of Cleaned Longitude",
x = "Longitude",
y = "Frequency")
rm(location_clean_sample)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3833487 204.8 7690634 410.8 7690634 410.8
## Vcells 49728113 379.4 125562000 958.0 210419010 1605.4
# Sample 10% of cleaned location data for histogram
location_clean_sample <- df_clean_ll %>%
select(Blurred_Longitude, Blurred_Latitude) %>%
sdf_sample(fraction = 0.5, replacement = FALSE) %>%
collect()
ggplot(data = location_clean_sample, aes(x = Blurred_Latitude)) +
geom_histogram(bins = 30, fill = "skyblue", color = "black") +
labs(title = "Distribution of Cleaned Latitude",
x = "Latitude",
y = "Frequency")
rm(location_clean_sample)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3833565 204.8 7690634 410.8 7690634 410.8
## Vcells 49729897 379.5 125562000 958.0 210419010 1605.4
# Setting the API key value
register_stadiamaps("04c1350e-f51f-4265-b458-ad6b6a3192bb", write = TRUE)
# Creating a map of Seattle
seattle <- c(left = -122.45, bottom = 47.48, right = -122.2, top = 47.73)
seattle_map <- get_stadiamap(seattle, zoom = 18)
# Plotting the map
ggmap(seattle_map) +
geom_point(data = df_clean_ll, aes(x = Blurred_Longitude, y = Blurred_Latitude, color = Final_Call_Type)) +
labs(title = "Map of Seattle City",
x = "Longitude",
y = "Latitude",
color = "Final Call Type")
include_graphics("Images/PointsOnTheMap.png")
df_collected <- df_clean_ll %>%
sdf_sample(fraction = 0.5, replacement = FALSE) %>%
collect()
print(length(unique(df_collected$Initial_Call_Type)))
## [1] 268
print(length(unique(df_collected$Final_Call_Type)))
## [1] 245
df_clean_ll <- df_clean_ll %>%
mutate(Initial_Category = case_when(
grepl("ASLT|Assault|ASSAULT|ASSAULTS|HARRASMENT|THREAT|THREATS|WEAPON|GUN|PANHANDLING|HARASSMENT|VIOLENT", Initial_Call_Type) ~ "Assaults and Threats",
grepl("TRAFFICING|SEX|RAPE|PORNOGRAPHY|PROSTITUTION|LEWD|PROWLER", Initial_Call_Type) ~ "Sex Offenses",
grepl("NARCOTICS|DRUGS|MARIJUANA|OVERDOSE|OD|LIQUOR|DETOX|INTOX|LIQ", Initial_Call_Type) ~ "Narcotics",
grepl("HARBOR|ANIMAL|GAMBLING|WATER|TREES|NORAD|STADIUM|ILLEGAL DUMPING|SLEEPER|HAZ|BIAS|NUISANCE|URINATING|HOSPITAL|PHONE|CROWD|EVENT|DEMONSTRATIONS|DISTURBANCE|UNUSUAL|NOISE|POWER|LANDLINE|LITTERING", Initial_Call_Type) ~ "Civil incidents and security",
grepl("DOA|SHOTS|CASUALTY|FELONY|SUSPICIOUS|ESCAPE|FIRE|PURSUIT|SWAT|SHOOTING|SUICIDE|HOSTAGE|HOMICIDE", Initial_Call_Type) ~ "Emergency and Critical incidents",
grepl("ROBBERY|BURGLARY|PROPERTY|THEFT|BREAKING|SHOPLIFT|ARSON|TRESPASS|BURG|BURN|EXPLOSION|FRAUD", Initial_Call_Type) ~ "Property Crimes",
grepl("ALARM|ORDER|INSPECTION|WATCH", Initial_Call_Type) ~ "Alarm and Security",
grepl("ASSIST|CHECK|HELP|ASSIGNED|PATROL", Initial_Call_Type) ~ "Assistance and Checks",
grepl("DOMESTIC|ABUSE|CUSTODIAL|ARGUMENTS|DV", Initial_Call_Type) ~ "Domestic Violence",
grepl("Traffic|VIOLATIONS|ACCIDENT|MVC|CAR|DUI|TRAF|ROAD|VEHICLE|DUI|ACC|HIT AND RUN|", Initial_Call_Type) ~ "Traffic Incident",
grepl("MISSING|AWOL|FOUND|RUNAWAY|ABDUCTION|KIDNAP|CHILD|JUVENILE|LOST|AMBER|A.W.O.L.", Initial_Call_Type) ~ "Missing Persons",
grepl("OBS", Initial_Call_Type) ~ "Observation",
grepl("CANCELLED|NO ANSWER|OUT AT RANGE", Initial_Call_Type) ~ "No action",
TRUE ~ "Other"
))
df_clean_ll <- df_clean_ll %>%
mutate(Final_Category = case_when(
grepl("ASLT|Assault|ASSAULT|ASSAULTS|HARRASMENT|THREAT|THREATS|WEAPON|GUN|PANHANDLING|HARASSMENT|VIOLENT", Final_Call_Type) ~ "Assaults and Threats",
grepl("TRAFFICING|SEX|RAPE|PORNOGRAPHY|PROSTITUTION|LEWD|PROWLER", Final_Call_Type) ~ "Sex Offenses",
grepl("NARCOTICS|DRUGS|MARIJUANA|OVERDOSE|OD|LIQUOR|DETOX|INTOX|LIQ", Final_Call_Type) ~ "Narcotics",
grepl("HARBOR|ANIMAL|GAMBLING|WATER|TREES|NORAD|STADIUM|ILLEGAL DUMPING|SLEEPER|HAZ|BIAS|NUISANCE|URINATING|HOSPITAL|PHONE|CROWD|EVENT|DEMONSTRATIONS|DISTURBANCE|UNUSUAL|NOISE|POWER|LANDLINE|LITTERING", Final_Call_Type) ~ "Civil incidents and security",
grepl("DOA|SHOTS|CASUALTY|FELONY|SUSPICIOUS|ESCAPE|FIRE|PURSUIT|SWAT|SHOOTING|SUICIDE|HOSTAGE|HOMICIDE", Final_Call_Type) ~ "Emergency and Critical incidents",
grepl("ROBBERY|BURGLARY|PROPERTY|THEFT|BREAKING|SHOPLIFT|ARSON|TRESPASS|BURG|BURN|EXPLOSION|FRAUD", Final_Call_Type) ~ "Property Crimes",
grepl("ALARM|ORDER|INSPECTION|WATCH", Final_Call_Type) ~ "Alarm and Security",
grepl("ASSIST|CHECK|HELP|ASSIGNED|PATROL", Final_Call_Type) ~ "Assistance and Checks",
grepl("DOMESTIC|ABUSE|CUSTODIAL|ARGUMENTS|DV", Final_Call_Type) ~ "Domestic Violence",
grepl("Traffic|VIOLATIONS|ACCIDENT|MVC|CAR|DUI|TRAF|ROAD|VEHICLE|DUI|ACC|HIT AND RUN|", Final_Call_Type) ~ "Traffic Incident",
grepl("MISSING|AWOL|FOUND|RUNAWAY|ABDUCTION|KIDNAP|CHILD|JUVENILE|LOST|AMBER|A.W.O.L.", Final_Call_Type) ~ "Missing Persons",
grepl("OBS", Final_Call_Type) ~ "Observation",
grepl("CANCELLED|NO ANSWER|OUT AT RANGE", Final_Call_Type) ~ "No action",
TRUE ~ "Other"
))
same_diff_counts <- df_clean_ll %>%
group_by(Same_Category = ifelse(Initial_Category == Final_Category, "Same", "Different")) %>%
summarise(Count = n())
# Bar chart for displaying values
ggplot(same_diff_counts, aes(x = Same_Category, y = Count, fill = Same_Category)) +
geom_bar(stat = "identity") +
labs(title = "Comparison of Initial and Final Categories",
x = "Category",
y = "Number of Occurrences",
fill = "Category")
rm(same_diff_counts)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3836301 204.9 7690634 410.8 7690634 410.8
## Vcells 44220030 337.4 168470437 1285.4 210545240 1606.4
initial_category_counts <- df_clean_ll %>%
group_by(Initial_Category) %>%
summarise(Count = n())
ggplot(initial_category_counts, aes(x = reorder(Initial_Category, -Count), y = Count, fill = Initial_Category)) +
geom_bar(stat = "identity") +
labs(title = "Distribution of Initial Categorizations",
x = "Initial Category",
y = "Number of Occurrences") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
rm(initial_category_counts)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3842705 205.3 7690634 410.8 7690634 410.8
## Vcells 44235245 337.5 134776350 1028.3 210545240 1606.4
final_category_counts <- df_clean_ll %>%
group_by(Final_Category) %>%
summarise(Count = n())
ggplot(final_category_counts, aes(x = reorder(Final_Category, -Count), y = Count, fill = Final_Category)) +
geom_bar(stat = "identity") +
labs(title = "Distribution of Final Categorizations",
x = "Final Category",
y = "Number of Occurrences") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
rm(final_category_counts)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3841989 205.2 7690634 410.8 7690634 410.8
## Vcells 44233595 337.5 134776350 1028.3 210545240 1606.4
location_final_sample <- df_clean_ll %>%
select(Blurred_Longitude, Blurred_Latitude, Final_Category) %>%
sdf_sample(fraction = 0.5, replacement = FALSE) %>% # Sample 1% of data
collect()
ggplot(location_final_sample, aes(x = Blurred_Longitude, y = Blurred_Latitude, color = Final_Category)) +
geom_point(alpha = 0.6) +
labs(title = "Visualization of the Relationship Between Final Category and Call Location",
x = "Longitude",
y = "Latitude",
color = "Final Category") +
theme_minimal() +
theme(legend.position = "bottom")
rm(location_final_sample)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3854064 205.9 7690634 410.8 7690634 410.8
## Vcells 88468372 675.0 163317422 1246.1 210545240 1606.4
location_final_sample <- df_clean_ll %>%
select(Blurred_Longitude, Blurred_Latitude, Sector) %>%
sdf_sample(fraction = 0.5, replacement = FALSE) %>% # Sample 1% of data
collect()
ggplot(location_final_sample, aes(x = Blurred_Longitude, y = Blurred_Latitude, color = Sector)) +
geom_point(alpha = 0.6) +
labs(title = "Visualization of the Relationship Between Sector and Call Location",
x = "Latitude",
y = "Longitude",
color = "Sector") +
theme_minimal() +
theme(legend.position = "bottom")
rm(location_final_sample)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3859972 206.2 7690634 410.8 7690634 410.8
## Vcells 88488704 675.2 163969710 1251.0 210545240 1606.4
location_final_sample <- df_clean_ll %>%
select(Blurred_Longitude, Blurred_Latitude, Precinct) %>%
sdf_sample(fraction = 0.5, replacement = FALSE) %>% # Sample 1% of data
collect()
ggplot(location_final_sample, aes(x = Blurred_Longitude, y = Blurred_Latitude, color = Precinct)) +
geom_point(alpha = 0.6) +
labs(title = "Visualization of the Relationship Between Police Precinct and Call Location",
x = "Latitude",
y = "Longitude",
color = "Police Precinct") +
theme_minimal() +
theme(legend.position = "bottom")
rm(location_final_sample)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3852365 205.8 7690634 410.8 7690634 410.8
## Vcells 88462062 675.0 165235006 1260.7 210545240 1606.4
Arrival_Time feature to indicate whether the response was
fast or not, and then perform binary classification. To determine what
constitutes a fast response versus a slow one, a histogram of
five-minute intervals will first be presented to examine the
distribution of occurrences.df_times <- df_clean_ll %>%
mutate(
Response_Time = (unix_timestamp(Arrived_Time) - unix_timestamp(Original_Time_Queued)) / 60
)
# Plot histogram with 5-minute bins
df_times_filtered <- df_times %>% filter(Response_Time >= 0 & Response_Time <= 1000)
response_time_sample <- df_times_filtered %>%
select(Response_Time) %>%
sdf_sample(fraction = 0.1, replacement = FALSE) %>%
collect()
ggplot(response_time_sample, aes(x = Response_Time)) +
geom_histogram(binwidth = 10, fill = "skyblue", color = "black") +
labs(title = "Distribution of Response Time",
x = "Response Time (minutes)",
y = "Frequency")
rm(response_time_sample)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3833415 204.8 7690634 410.8 7690634 410.8
## Vcells 44767490 341.6 132188005 1008.6 210545240 1606.4
Response_Speed was created with values 1 and 0. The values
of this feature are assigned based on the median response time -
Response_Time. All instances with a response time below the
median are assigned a Response_Speed value of 1,
representing a fast response, while the remaining instances are assigned
a value of 0, representing a slow response. The following code
demonstrates how this feature is created in the dataset:# Calculate median in Spark without collecting all data
median_result <- df_times_filtered %>%
summarise(median_time = percentile_approx(Response_Time, 0.5)) %>%
collect()
median_value <- median_result$median_time
# Create Response_Speed label
df_prepared <- df_times_filtered %>%
mutate(Response_Speed = if_else(Response_Time <= median_value, 1, 0))
response_speeds <- df_prepared %>%
group_by(Response_Speed) %>%
summarise(Count = n()) %>%
collect()
ggplot(response_speeds, aes(x = Response_Speed, y = Count, fill = Response_Speed)) +
geom_bar(stat = "identity") +
labs(title = "Distribution of Response Speed",
x = "Response Speed",
y = "Number of Occurrences",
fill = "Response Speed")
rm(response_speeds)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3847181 205.5 7690634 410.8 7690634 410.8
## Vcells 44237912 337.6 132188005 1008.6 210545240 1606.4
viz_sample <- df_prepared %>%
select(Priority, Initial_Category, Blurred_Longitude, Blurred_Latitude,
Sector, Response_Time, Response_Speed) %>%
sdf_sample(fraction = 0.05, replacement = FALSE) %>%
collect()
ggplot(viz_sample, aes(x = Priority, y = Response_Time)) +
geom_boxplot() +
labs(title = "Response Time vs. Priority", x = "Priority", y = "Response Time")
ggplot(viz_sample, aes(x = Initial_Category, y = Response_Time)) +
geom_boxplot() +
labs(title = "Response Time vs. Initial Category", x = "Initial Category", y = "Response Time") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(viz_sample, aes(x = Blurred_Longitude, y = Blurred_Latitude, color = Response_Speed)) +
geom_point() +
labs(title = "Response Speed vs. Location", x = "Longitude", y = "Latitude", color = "Response Time")
ggplot(viz_sample, aes(x = Sector, y = Response_Time)) +
geom_boxplot() +
labs(title = "Response Time vs. Sector", x = "Sector", y = "Response Time") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
rm(viz_sample)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3913845 209.1 7690634 410.8 7690634 410.8
## Vcells 52935258 403.9 132188005 1008.6 210545240 1606.4
Domestic Violence and Emergency and
Critical Incidents, also result in a quick response.Response_Time, Sector, Precinct,
Initial_Category, Longitude, and
Latitude.df_prepared_split <- df_prepared %>%
select(Response_Time, Response_Speed, Sector, Precinct, Initial_Category, Blurred_Longitude, Blurred_Latitude, Priority) %>%
sdf_random_split(training = 0.8,
test = 0.2,
seed = 100)
train_data <- df_prepared_split$training
test_data <- df_prepared_split$test